Loading the required libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ranger)
library(DHARMa)
## This is DHARMa 0.4.1. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa') Note: Syntax of plotResiduals has changed in 0.3.0, see ?plotResiduals for details
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
## 
##     importance
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Loading the data

# Creating a list of the column names
col_names <- c("Date","Rented Bike Count","Hour", "Temperature", "Humidity", "Wind speed", "Visibility", "Dew point temperature", "Solar Radiation", "Rainfall", "Snowfall", "Seasons", "Holiday", "Functioning Day")


# creating the dataframe and affecting the column names
bike <- read.csv('SeoulBikeData.csv', header = FALSE, sep = ",", col.names = col_names, skip = 1)
bike
summary(bike$Rented.Bike.Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   191.0   504.5   704.6  1065.2  3556.0
# Print the mean and the variance of the output 

sprintf("Variance of Y: %f", var(bike$Rented.Bike.Count))
## [1] "Variance of Y: 416021.733390"
sprintf("Mean of Y: %f", mean(bike$Rented.Bike.Count))
## [1] "Mean of Y: 704.602055"
checkWeekday <- function(dateNumber) {
  if (dateNumber %in% c(6,7))
    return (0)
  else
    return (1)
}


checkWeekEnd <- function(dateNumber) {
  if (dateNumber %in% c(6,7))
    return (1)
  else
    return (0)
}

checkHoliday <- function(holidayStatus) {
  if (holidayStatus == "No Holiday")
    return (0)
  else
    return (1)
}

# bike_with_time <- bike %>%
#   mutate(Date = dmy(Date),
#          Day = as.integer(wday(Date)),
#          Weekday = sapply(X = Day, FUN = checkWeekday),
#          WeekEnd = sapply(X = Day, FUN = checkWeekEnd),
#          Holiday = sapply(X = Holiday, FUN = checkHoliday))


# To set the language in english for the dummy variables
Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
bike_with_time <- bike %>%
  mutate(Date = dmy(Date),
         Day = as.integer(wday(Date)),
         Month = as.character(month(Date, label= TRUE)))


bike_with_time <- bike_with_time %>%
  dplyr::select(-Date)

bike_dummy <- dummyVars(Rented.Bike.Count~., data = bike_with_time)

bike_dummy <- predict(bike_dummy, newdata = bike_with_time)

bike_encoded <- as.data.frame.matrix(bike_dummy)

colnames(bike_encoded)[colnames(bike_encoded) == "HolidayNo Holiday"] <- "NoHoliday"

bike_encoded$Rented.Bike.Count <- bike$Rented.Bike.Count

# final dataframe encoded which will be split
bike_encoded
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(splitTools)
library(ModelMetrics)
## 
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
## 
##     kappa
# splitting of the data into a training set and a testing set
set.seed(23)
inds <- partition(bike_encoded$Rented.Bike.Count, p = c(train = 0.8, test = 0.2))
bike_train <- bike_encoded[inds$train,]
bike_test <- bike_encoded[inds$test,]

Poisson regression

Model

# poisson regression model
poisson_lm1 <- glm(Rented.Bike.Count ~., family = "poisson", data = bike_train)

summary(poisson_lm1)
## 
## Call:
## glm(formula = Rented.Bike.Count ~ ., family = "poisson", data = bike_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -49.107   -8.745   -1.413    4.940  104.801  
## 
## Coefficients: (7 not defined because of singularities)
##                         Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)            5.741e+00  8.239e-03  696.769  < 2e-16 ***
## Hour                   4.283e-02  7.847e-05  545.749  < 2e-16 ***
## Temperature            1.364e-02  3.095e-04   44.062  < 2e-16 ***
## Humidity              -1.489e-02  8.863e-05 -167.990  < 2e-16 ***
## Wind.speed             2.146e-02  5.058e-04   42.428  < 2e-16 ***
## Visibility             6.042e-05  1.163e-06   51.953  < 2e-16 ***
## Dew.point.temperature  2.747e-02  3.193e-04   86.031  < 2e-16 ***
## Solar.Radiation       -7.856e-02  6.684e-04 -117.534  < 2e-16 ***
## Rainfall              -5.002e-01  2.385e-03 -209.757  < 2e-16 ***
## Snowfall              -1.080e-01  2.163e-03  -49.948  < 2e-16 ***
## SeasonsAutumn          7.815e-01  4.915e-03  159.007  < 2e-16 ***
## SeasonsSpring          8.990e-01  4.373e-03  205.594  < 2e-16 ***
## SeasonsSummer          9.320e-01  4.774e-03  195.216  < 2e-16 ***
## SeasonsWinter                 NA         NA       NA       NA    
## HolidayHoliday        -2.139e-01  2.471e-03  -86.537  < 2e-16 ***
## NoHoliday                     NA         NA       NA       NA    
## Functioning.DayNo     -1.808e+01  1.069e+01   -1.692  0.09062 .  
## Functioning.DayYes            NA         NA       NA       NA    
## Day                    1.603e-02  2.254e-04   71.144  < 2e-16 ***
## MonthApr              -1.502e-01  2.218e-03  -67.711  < 2e-16 ***
## MonthAug              -5.911e-01  2.156e-03 -274.155  < 2e-16 ***
## MonthDec               2.455e-01  3.904e-03   62.875  < 2e-16 ***
## MonthFeb               1.087e-02  4.089e-03    2.659  0.00783 ** 
## MonthJan                      NA         NA       NA       NA    
## MonthJul              -4.083e-01  2.014e-03 -202.707  < 2e-16 ***
## MonthJun                      NA         NA       NA       NA    
## MonthMar              -3.368e-01  2.644e-03 -127.416  < 2e-16 ***
## MonthMay                      NA         NA       NA       NA    
## MonthNov               1.138e-01  3.161e-03   35.987  < 2e-16 ***
## MonthOct               2.467e-01  2.358e-03  104.630  < 2e-16 ***
## MonthSep                      NA         NA       NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3991664  on 7006  degrees of freedom
## Residual deviance: 1155312  on 6983  degrees of freedom
## AIC: 1209023
## 
## Number of Fisher Scoring iterations: 9
# creation of simulated residuals with DHARMa on the poisson regression model
simulationOutput <- simulateResiduals(fittedModel = poisson_lm1, plot = F)

# QQ plot
plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# Check for dispersion in the data
testDispersion(simulationOutput, plot = F)
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 189, p-value < 2.2e-16
## alternative hypothesis: two.sided

Quasipoisson Regression

Model

# Quasipoisson model
poisson_lm2 <- glm(Rented.Bike.Count ~., family = "quasipoisson", data = bike_train)

summary(poisson_lm2)
## 
## Call:
## glm(formula = Rented.Bike.Count ~ ., family = "quasipoisson", 
##     data = bike_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -49.107   -8.745   -1.413    4.940  104.801  
## 
## Coefficients: (7 not defined because of singularities)
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            5.741e+00  3.276e+00   1.753   0.0797 .
## Hour                   4.283e-02  3.120e-02   1.373   0.1699  
## Temperature            1.364e-02  1.231e-01   0.111   0.9118  
## Humidity              -1.489e-02  3.524e-02  -0.423   0.6727  
## Wind.speed             2.146e-02  2.011e-01   0.107   0.9150  
## Visibility             6.042e-05  4.624e-04   0.131   0.8960  
## Dew.point.temperature  2.747e-02  1.270e-01   0.216   0.8287  
## Solar.Radiation       -7.856e-02  2.657e-01  -0.296   0.7675  
## Rainfall              -5.002e-01  9.481e-01  -0.528   0.5978  
## Snowfall              -1.080e-01  8.600e-01  -0.126   0.9000  
## SeasonsAutumn          7.815e-01  1.954e+00   0.400   0.6892  
## SeasonsSpring          8.990e-01  1.738e+00   0.517   0.6051  
## SeasonsSummer          9.320e-01  1.898e+00   0.491   0.6234  
## SeasonsWinter                 NA         NA      NA       NA  
## HolidayHoliday        -2.139e-01  9.826e-01  -0.218   0.8277  
## NoHoliday                     NA         NA      NA       NA  
## Functioning.DayNo     -1.808e+01  4.249e+03  -0.004   0.9966  
## Functioning.DayYes            NA         NA      NA       NA  
## Day                    1.603e-02  8.960e-02   0.179   0.8580  
## MonthApr              -1.502e-01  8.819e-01  -0.170   0.8648  
## MonthAug              -5.911e-01  8.572e-01  -0.690   0.4905  
## MonthDec               2.455e-01  1.552e+00   0.158   0.8743  
## MonthFeb               1.087e-02  1.626e+00   0.007   0.9947  
## MonthJan                      NA         NA      NA       NA  
## MonthJul              -4.083e-01  8.008e-01  -0.510   0.6102  
## MonthJun                      NA         NA      NA       NA  
## MonthMar              -3.368e-01  1.051e+00  -0.320   0.7486  
## MonthMay                      NA         NA      NA       NA  
## MonthNov               1.138e-01  1.257e+00   0.091   0.9279  
## MonthOct               2.467e-01  9.376e-01   0.263   0.7924  
## MonthSep                      NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 158070.4)
## 
##     Null deviance: 3991664  on 7006  degrees of freedom
## Residual deviance: 1155312  on 6983  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 9

Negative binomial Regression

Model

# Negative binomial (NB) model
poisson_lm3 <- glm.nb(formula = Rented.Bike.Count ~ . -Rainfall, link = "log", data = bike_train)

summary(poisson_lm3)
## 
## Call:
## glm.nb(formula = Rented.Bike.Count ~ . - Rainfall, data = bike_train, 
##     link = "log", init.theta = 2.477575252)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6769  -0.6781  -0.1236   0.3208   4.8027  
## 
## Coefficients: (7 not defined because of singularities)
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            7.239e+00  1.537e-01  47.105  < 2e-16 ***
## Hour                   3.831e-02  1.245e-03  30.764  < 2e-16 ***
## Temperature           -3.068e-02  6.129e-03  -5.006 5.56e-07 ***
## Humidity              -3.255e-02  1.696e-03 -19.200  < 2e-16 ***
## Wind.speed            -9.285e-03  8.590e-03  -1.081 0.279745    
## Visibility             7.070e-05  1.925e-05   3.673 0.000239 ***
## Dew.point.temperature  8.287e-02  6.407e-03  12.934  < 2e-16 ***
## Solar.Radiation       -3.795e-02  1.281e-02  -2.963 0.003042 ** 
## Snowfall              -7.244e-02  1.784e-02  -4.061 4.88e-05 ***
## SeasonsAutumn          5.581e-01  6.930e-02   8.054 7.99e-16 ***
## SeasonsSpring          6.861e-01  5.882e-02  11.665  < 2e-16 ***
## SeasonsSummer          7.311e-01  6.647e-02  11.000  < 2e-16 ***
## SeasonsWinter                 NA         NA      NA       NA    
## HolidayHoliday        -3.305e-01  3.675e-02  -8.993  < 2e-16 ***
## NoHoliday                     NA         NA      NA       NA    
## Functioning.DayNo     -3.009e+01  4.335e+03  -0.007 0.994462    
## Functioning.DayYes            NA         NA      NA       NA    
## Day                    2.692e-02  3.887e-03   6.926 4.33e-12 ***
## MonthApr              -1.213e-01  4.024e-02  -3.014 0.002576 ** 
## MonthAug              -6.214e-01  4.145e-02 -14.991  < 2e-16 ***
## MonthDec               2.752e-01  3.757e-02   7.324 2.41e-13 ***
## MonthFeb               9.685e-03  3.849e-02   0.252 0.801330    
## MonthJan                      NA         NA      NA       NA    
## MonthJul              -4.466e-01  4.032e-02 -11.076  < 2e-16 ***
## MonthJun                      NA         NA      NA       NA    
## MonthMar              -2.375e-01  4.327e-02  -5.488 4.07e-08 ***
## MonthMay                      NA         NA      NA       NA    
## MonthNov               2.864e-01  5.393e-02   5.310 1.10e-07 ***
## MonthOct               3.402e-01  4.452e-02   7.641 2.16e-14 ***
## MonthSep                      NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.4776) family taken to be 1)
## 
##     Null deviance: 23433.6  on 7006  degrees of freedom
## Residual deviance:  7246.1  on 6984  degrees of freedom
## AIC: 96567
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.4776 
##           Std. Err.:  0.0406 
## 
##  2 x log-likelihood:  -96518.6990
# Residual analysis of the NB-regression
simulationOutput <- simulateResiduals(fittedModel = poisson_lm3, plot = F, refit = F)

# QQ plot residuals for NB-regression
plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

# Check the dispersion
testDispersion(simulationOutput)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.52772, p-value < 2.2e-16
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 0.99968, p-value = 1
## alternative hypothesis: two.sided

Zero-inflated NB

# Zero-inflated NB regression model
zeroinfl_poisson <- zeroinfl(Rented.Bike.Count ~ . | Hour, data = bike_train, 
                             dist = "negbin", link = "log")
## Warning in value[[3L]](cond): le système est numériquement singulier :
## conditionnement de la réciproque = 3.31183e-25FALSE
summary(zeroinfl_poisson)
## 
## Call:
## zeroinfl(formula = Rented.Bike.Count ~ . | Hour, data = bike_train, dist = "negbin", 
##     link = "log")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5135 -0.6140 -0.1200  0.3881 13.7655 
## 
## Count model coefficients (negbin with log link):
##                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)            7.221e+00         NA      NA       NA
## Hour                   3.937e-02         NA      NA       NA
## Temperature           -2.342e-02         NA      NA       NA
## Humidity              -2.944e-02         NA      NA       NA
## Wind.speed            -7.120e-03         NA      NA       NA
## Visibility             6.929e-05         NA      NA       NA
## Dew.point.temperature  7.435e-02         NA      NA       NA
## Solar.Radiation       -3.679e-02         NA      NA       NA
## Rainfall              -1.108e-01         NA      NA       NA
## Snowfall              -7.839e-02         NA      NA       NA
## SeasonsAutumn          4.203e-01         NA      NA       NA
## SeasonsSpring          5.120e-01         NA      NA       NA
## SeasonsSummer          6.550e-01         NA      NA       NA
## SeasonsWinter         -8.709e-02         NA      NA       NA
## HolidayHoliday        -3.858e-01         NA      NA       NA
## NoHoliday             -5.117e-02         NA      NA       NA
## Functioning.DayNo     -1.508e-12         NA      NA       NA
## Functioning.DayYes    -4.832e-02         NA      NA       NA
## Day                    2.687e-02         NA      NA       NA
## MonthApr              -8.267e-02         NA      NA       NA
## MonthAug              -6.563e-01         NA      NA       NA
## MonthDec               2.215e-01         NA      NA       NA
## MonthFeb              -3.916e-02         NA      NA       NA
## MonthJan              -4.780e-02         NA      NA       NA
## MonthJul              -4.959e-01         NA      NA       NA
## MonthJun              -3.744e-02         NA      NA       NA
## MonthMar              -2.039e-01         NA      NA       NA
## MonthMay               5.920e-02         NA      NA       NA
## MonthNov               2.863e-01         NA      NA       NA
## MonthOct               3.441e-01         NA      NA       NA
## MonthSep               1.017e-02         NA      NA       NA
## Log(theta)             9.482e-01         NA      NA       NA
## 
## Zero-inflation model coefficients (binomial with log link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3909953         NA      NA       NA
## Hour         0.0003823         NA      NA       NA
## 
## Theta = 2.581 
## Number of iterations in BFGS optimization: 1 
## Log-likelihood: -4.914e+04 on 34 Df
# Zero-inflated NB-regression predictions
y_pred_zeroinfl <- predict(zeroinfl_poisson, newdata = bike_test[,-31])

rmse(y_pred_zeroinfl, bike_test[, 31])
## [1] 447.1093

Comparison of the three Poisson-family models (Poisson, quasi-Poisson, negative binomial)

Predictions for the Poisson-family regressions

nb_col = length(bike_train)[1]

# Poisson regression predictions
y_pred_poisson <- predict(poisson_lm1, 
                          newdata = bike_test[, 1: nb_col-1], 
                          type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Quasi-poisson regression predictions
y_pred_quasipoisson <- predict(poisson_lm2, 
                               newdata = bike_test[, 1:nb_col-1], 
                               type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Negative binomial regression predictions
y_pred_nb <- predict(poisson_lm3, 
                     newdata = bike_test[, 1:nb_col-1], 
                     type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

Try Linear Regression (a bad idea for count data)

model_lm <- lm(Rented.Bike.Count ~. -Visibility, data = bike_train)

summary(model_lm)
## 
## Call:
## lm(formula = Rented.Bike.Count ~ . - Visibility, data = bike_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1465.52  -261.46   -46.93   205.98  1970.13 
## 
## Coefficients: (7 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            736.4365    96.0262   7.669 1.97e-14 ***
## Hour                    26.7316     0.7946  33.643  < 2e-16 ***
## Temperature             12.3524     3.9453   3.131  0.00175 ** 
## Humidity               -12.9014     1.0764 -11.986  < 2e-16 ***
## Wind.speed              25.5273     5.4512   4.683 2.88e-06 ***
## Dew.point.temperature   17.5542     4.1622   4.218 2.50e-05 ***
## Solar.Radiation        -96.2550     8.1078 -11.872  < 2e-16 ***
## Rainfall               -53.1698     4.3862 -12.122  < 2e-16 ***
## Snowfall                31.4723    11.4911   2.739  0.00618 ** 
## SeasonsAutumn          292.4838    38.8163   7.535 5.50e-14 ***
## SeasonsSpring          353.4829    35.7011   9.901  < 2e-16 ***
## SeasonsSummer          484.2193    40.2305  12.036  < 2e-16 ***
## SeasonsWinter                NA         NA      NA       NA    
## HolidayHoliday        -152.2853    23.1731  -6.572 5.33e-11 ***
## NoHoliday                    NA         NA      NA       NA    
## Functioning.DayNo     -951.2204    28.4597 -33.423  < 2e-16 ***
## Functioning.DayYes           NA         NA      NA       NA    
## Day                     12.7636     2.4900   5.126 3.04e-07 ***
## MonthApr              -149.0339    25.4825  -5.848 5.18e-09 ***
## MonthAug              -541.3169    25.4016 -21.310  < 2e-16 ***
## MonthDec                69.4644    24.2270   2.867  0.00415 ** 
## MonthFeb               -39.3966    24.8242  -1.587  0.11255    
## MonthJan                     NA         NA      NA       NA    
## MonthJul              -371.4566    25.1335 -14.779  < 2e-16 ***
## MonthJun                     NA         NA      NA       NA    
## MonthMar              -247.5485    26.9442  -9.187  < 2e-16 ***
## MonthMay                     NA         NA      NA       NA    
## MonthNov                 6.4561    29.3392   0.220  0.82584    
## MonthOct               148.6621    26.0829   5.700 1.25e-08 ***
## MonthSep                     NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 413.4 on 6984 degrees of freedom
## Multiple R-squared:  0.5923, Adjusted R-squared:  0.591 
## F-statistic: 461.1 on 22 and 6984 DF,  p-value: < 2.2e-16

Predictions

# Linear regression predictions
y_pred_lm <- predict(model_lm, newdata = bike_test[, -nb_col], type = "response")
## Warning in predict.lm(model_lm, newdata = bike_test[, -nb_col], type =
## "response"): prediction from a rank-deficient fit may be misleading

Random Forest

Model

rf_model <- randomForest(Rented.Bike.Count ~ ., data = bike_train)

Predictions

# Random Forest model predictions
y_pred_rf <- predict(rf_model, newdata = bike_test[, -nb_col])

Comparisons of the models

sprintf("Poisson regression : %1.0f", rmse(bike_test[, nb_col], y_pred_poisson))
## [1] "Poisson regression : 371"
sprintf("Quasipoisson : %1.0f", rmse(bike_test[, nb_col], y_pred_quasipoisson))
## [1] "Quasipoisson : 371"
sprintf("Negative binomiale : %1.0f",rmse(bike_test[, nb_col], y_pred_nb))
## [1] "Negative binomiale : 428"
sprintf("Linear Regression : %1.0f",rmse(bike_test[, nb_col], y_pred_lm))
## [1] "Linear Regression : 416"
sprintf("Random Forest : %1.0f",rmse(bike_test[, nb_col], y_pred_rf))
## [1] "Random Forest : 187"
# We can see that the min is negative for LR
print("Linear Regression")
## [1] "Linear Regression"
summary(y_pred_lm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -597.9   395.1   713.2   718.7  1089.0  1842.7
print("Poisson regression")
## [1] "Poisson regression"
summary(y_pred_poisson)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   296.9   572.9   713.0  1075.0  2460.8
print("Quasipoisson regression")
## [1] "Quasipoisson regression"
summary(y_pred_quasipoisson)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   296.9   572.9   713.0  1075.0  2460.8
print("Negative binomial regression")
## [1] "Negative binomial regression"
summary(y_pred_nb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   317.2   573.2   722.6  1079.0  5368.1
print("Random Forest")
## [1] "Random Forest"
summary(y_pred_rf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   238.8   539.8   708.1  1079.4  2860.3

Results

sprintf("Poisson regression : %.3f", rmsle(bike_test[, nb_col], y_pred_poisson))
## [1] "Poisson regression : 0.733"
sprintf("Quasipoisson : %.3f", rmsle(bike_test[, nb_col], y_pred_quasipoisson))
## [1] "Quasipoisson : 0.733"
sprintf("Negative binomiale : %.3f",rmsle(bike_test[, nb_col], y_pred_nb))
## [1] "Negative binomiale : 0.811"
sprintf("Random Forest : %.3f",rmsle(bike_test[, nb_col], y_pred_rf))
## [1] "Random Forest : 0.693"
# number of features
n_features <- nb_col

# tuning grid
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  rmse  = NA
)

# Looping through the grid
for(i in seq_len(nrow(tuning_grid))) {

  # Fit a random forest for each hyperparameter value for the number of trees
  fit <- ranger(
    formula = Rented.Bike.Count ~ ., 
    data = bike_train, 
    num.trees = tuning_grid$trees[i],
    mtry = floor(n_features / 3),
    respect.unordered.factors = 'order',
    verbose = FALSE,
    seed = 123
  )
  
  # Extract OOB RMSE
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
}

ggplot(tuning_grid, aes(trees, rmse)) +
  geom_line(size = 1) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees")

# tuning grid
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 10),
  mtry  = c(5, 10, 15, 20, 21, 30),
  rmse  = NA
)

# Looping through the grid
for(i in seq_len(nrow(tuning_grid))) {
  
  # Fit a random forest for each nb of trees and mtry values
  fit <- ranger(
  formula    = Rented.Bike.Count ~ ., 
  data       = bike_train, 
  num.trees  = tuning_grid$trees[i],
  mtry       = tuning_grid$mtry[i],
  respect.unordered.factors = 'order',
  verbose    = FALSE,
  seed       = 23
)
  # Extract OOB RMSE
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

labels <- tuning_grid %>%
  filter(trees == 990) %>%
  mutate(mtry = as.factor(mtry))

#Plot of the grid search
tuning_grid %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(aes(trees, rmse, color = mtry)) +
  geom_line(size = 1, show.legend = T) +
  ggrepel::geom_text_repel(data = labels, aes(trees, rmse, label = mtry), nudge_x = 50, show.legend = FALSE) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees") +
  labs(title = "Grid search on ntree and mtry")